This document aims to provide further examples in how to use the hpgltools.
Note to self, the header has rmarkdown::pdf_document instead of html_document or html_vignette because it gets some bullcrap error ‘margins too large’…
Here are the commands I invoke to get ready to play with new data, including everything required to install hpgltools, the software it uses, and the fission data.
library(hpgltools)
tt <- sm(library(fission))
tt <- data(fission)
Later on in this, I will do some ontology shenanigans. But I can grab some annotations from biomart now.
pombe_annotations <- load_biomart_annotations(
host = "fungi.ensembl.org",
trymart = "fungal_mart",
trydataset = "spombe_eg_gene",
gene_requests = c("pombase_transcript", "ensembl_gene_id", "ensembl_transcript_id",
"hgnc_symbol", "description", "gene_biotype"),
species = "spombe", overwrite = TRUE)
## Unable to perform useMart, perhaps the host/mart is incorrect: fungi.ensembl.org fungal_mart.
## The available marts are:
## fungi_mart, fungi_variations
## Trying the first one.
## Successfully connected to the spombe_eg_gene database.
## Some attributes in your request list were not in the ensembl database.
## Finished downloading ensembl gene annotations.
## Finished downloading ensembl structure annotations.
## Dropping haplotype chromosome annotations, set drop_haplotypes = FALSE if this is bad.
## Saving annotations to spombe_biomart_annotations.rda.
## Finished save().
pombe_mart <- pombe_annotations[["mart"]]
annotations <- pombe_annotations[["annotation"]]
rownames(annotations) <- make.names(gsub(pattern = "\\.\\d+$",
replacement = "",
x = rownames(annotations)), unique = TRUE)
All the work I do in Dr. El-Sayed’s lab makes some pretty hard assumptions about how data is stored. As a result, to use the fission data set I will do a little bit of shenanigans to match it to the expected format. Now that I have played a little with fission, I think its format is quite nice and am likely to have my experiment class instead be a SummarizedExperiment.
## Extract the meta data from the fission dataset
meta <- as.data.frame(fission@colData)
## Make conditions and batches
meta[["condition"]] <- paste(meta$strain, meta$minute, sep = ".")
meta[["batch"]] <- meta[["replicate"]]
meta[["sample.id"]] <- rownames(meta)
## Grab the count data
fission_data <- fission@assays[["data"]][["counts"]]
## This will make an experiment superclass called 'expt' and it contains
## an ExpressionSet along with any arbitrary additional information one might want to include.
## Along the way it writes a Rdata file which is by default called 'expt.Rdata'
fission_expt <- create_expt(metadata = meta,
count_dataframe = fission_data,
gene_info = annotations)
## Reading the sample metadata.
## The sample definitions comprises: 36 rows(samples) and 7 columns(metadata fields).
## Matched 5710 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 7039 rows and 36 columns.
Travis wisely imposes a limit on the amount of time for building vignettes. My tools by default will attempt all possible pairwise comparisons, which takes a long time. Therefore I am going to take a subset of the data and limit these comparisons to that.
fun_data <- subset_expt(fission_expt,
subset = "condition=='wt.120'|condition=='wt.30'")
## There were 36, now there are 6 samples.
fun_filt <- normalize_expt(fun_data, filter = "simple")
## Removing 462 low-count genes (6577 remaining).
fun_norm <- sm(normalize_expt(fun_filt, batch = "limma", norm = "quant",
transform = "log2", convert = "cpm"))
limma_comparison <- sm(limma_pairwise(fun_data))
names(limma_comparison[["all_tables"]])
## [1] "wt30_vs_wt120"
summary(limma_comparison[["all_tables"]][["wt30_vs_wt120"]])
## logFC AveExpr t P.Value
## Min. :-4.278 Min. :-4.58 Min. :-88.48 Min. :0.0000
## 1st Qu.:-0.399 1st Qu.: 1.11 1st Qu.: -2.60 1st Qu.:0.0192
## Median :-0.020 Median : 3.97 Median : -0.13 Median :0.1240
## Mean : 0.008 Mean : 3.11 Mean : -0.17 Mean :0.2792
## 3rd Qu.: 0.300 3rd Qu.: 5.44 3rd Qu.: 1.72 3rd Qu.:0.4653
## Max. : 7.075 Max. :18.59 Max. : 62.44 Max. :1.0000
## adj.P.Val B
## Min. :0.0170 Min. :-8.29
## 1st Qu.:0.0767 1st Qu.:-6.58
## Median :0.2479 Median :-5.50
## Mean :0.3686 Mean :-4.87
## 3rd Qu.:0.6204 3rd Qu.:-3.50
## Max. :1.0000 Max. : 4.83
scatter_wt_mut <- extract_coefficient_scatter(limma_comparison, type = "limma",
x = "wt30", y = "wt120")
## This can do comparisons among the following columns in the pairwise result:
## wt120, wt30
## Actually comparing wt30 and wt120.
scatter_wt_mut[["scatter"]]
scatter_wt_mut[["both_histogram"]][["plot"]] +
ggplot2::scale_y_continuous(limits = c(0, 0.20))
## Warning: Removed 7039 rows containing non-finite values (stat_bin).
## Warning: Removed 7039 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
ma_wt_mut <- extract_de_plots(limma_comparison, type = "limma")
ma_wt_mut[["ma"]][["plot"]]
ma_wt_mut[["volcano"]][["plot"]]
deseq_comparison <- sm(deseq2_pairwise(fun_data))
summary(deseq_comparison[["all_tables"]][["wt30_vs_wt120"]])
## baseMean logFC lfcSE stat
## Min. : 0 Min. :-5.615 Min. :0.000 Min. :-20.800
## 1st Qu.: 28 1st Qu.:-0.386 1st Qu.:0.168 1st Qu.: -1.176
## Median : 192 Median : 0.000 Median :0.222 Median : 0.000
## Mean : 1703 Mean : 0.020 Mean :0.489 Mean : 0.168
## 3rd Qu.: 536 3rd Qu.: 0.343 3rd Qu.:0.412 3rd Qu.: 1.108
## Max. :4924000 Max. : 7.212 Max. :4.072 Max. : 30.370
## P.Value adj.P.Val
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0197 1st Qu.:0.0685
## Median :0.2503 Median :0.4676
## Mean :0.3600 Mean :0.4805
## 3rd Qu.:0.6666 3rd Qu.:0.8732
## Max. :1.0000 Max. :1.0000
scatter_wt_mut <- extract_coefficient_scatter(deseq_comparison, type = "deseq",
x = "wt30", y = "wt120", gvis_filename = NULL)
## This can do comparisons among the following columns in the pairwise result:
## wt120, wt30, r2, r3
## Actually comparing wt30 and wt120.
scatter_wt_mut[["scatter"]]
plots_wt_mut <- extract_de_plots(deseq_comparison, type = "deseq")
plots_wt_mut[["ma"]][["plot"]]
plots_wt_mut[["volcano"]][["plot"]]
edger_comparison <- sm(edger_pairwise(fun_data, model_batch = TRUE))
plots_wt_mut <- extract_de_plots(edger_comparison, type = "edger")
scatter_wt_mut <- extract_coefficient_scatter(edger_comparison, type = "edger",
x = "wt30", y = "wt120", gvis_filename = NULL)
## This can do comparisons among the following columns in the pairwise result:
## wt120, wt30
## Actually comparing wt30 and wt120.
scatter_wt_mut[["scatter"]]
plots_wt_mut[["ma"]][["plot"]]
plots_wt_mut[["volcano"]][["plot"]]
{r simple_edger2 ebseq_comparison <- sm(ebseq_pairwise(fun_data)) head(ebseq_comparison$all_tables[[1]])
basic_comparison <- sm(basic_pairwise(fun_data))
summary(basic_comparison$all_tables$wt30_vs_wt120)
## numerator_mean denominator_mean numerator_var denominator_var
## Min. :-2.20 Min. :-3.20 Min. :0.000 Min. :0.000
## 1st Qu.: 3.29 1st Qu.: 3.29 1st Qu.:0.019 1st Qu.:0.010
## Median : 4.63 Median : 4.63 Median :0.053 Median :0.028
## Mean : 4.70 Mean : 4.70 Mean :0.134 Mean :0.073
## 3rd Qu.: 5.92 3rd Qu.: 5.93 3rd Qu.:0.134 3rd Qu.:0.074
## Max. :18.61 Max. :18.61 Max. :9.509 Max. :5.907
## t p logFC adjp
## Min. :-50.21 Min. :0.0000 Min. :-4.073 Min. :0.0045
## 1st Qu.: -2.10 1st Qu.:0.0327 1st Qu.:-0.399 1st Qu.:0.1307
## Median : -0.39 Median :0.1602 Median :-0.077 Median :0.3203
## Mean : -0.16 Mean :0.2766 Mean : 0.000 Mean :0.3876
## 3rd Qu.: 1.53 3rd Qu.:0.4585 3rd Qu.: 0.278 3rd Qu.:0.6113
## Max. : 49.10 Max. :0.9996 Max. : 7.084 Max. :0.9996
scatter_wt_mut <- extract_coefficient_scatter(basic_comparison, type = "basic",
x = "wt30", y = "wt120")
## This can do comparisons among the following columns in the pairwise result:
## wt120, wt30
## Actually comparing wt30 and wt120.
scatter_wt_mut[["scatter"]]
plots_wt_mut <- extract_de_plots(basic_comparison, type = "basic")
plots_wt_mut[["ma"]][["plot"]]
plots_wt_mut[["volcano"]][["plot"]]
all_comparisons <- sm(all_pairwise(fun_data, model_batch = TRUE, parallel = FALSE))
all_combined <- sm(combine_de_tables(all_comparisons, excel = FALSE))
head(all_combined$data[[1]], n = 3)
## ensembltranscriptid pombasetranscript ensemblgeneid
## SPAC1002.01 SPAC1002.01.1 SPAC1002.01.1 SPAC1002.01
## SPAC1002.02 SPAC1002.02.1 SPAC1002.02.1 SPAC1002.02
## SPAC1002.03c SPAC1002.03c.1 SPAC1002.03c.1 SPAC1002.03c
## description
## SPAC1002.01 conserved fungal protein [Source:PomBase;Acc:SPAC1002.01]
## SPAC1002.02 nucleoporin Pom34 [Source:PomBase;Acc:SPAC1002.02]
## SPAC1002.03c glucosidase II alpha subunit Gls2 [Source:PomBase;Acc:SPAC1002.03c]
## genebiotype cdslength chromosomename strand startposition
## SPAC1002.01 protein_coding 540 I + 1798347
## SPAC1002.02 protein_coding 690 I + 1799061
## SPAC1002.03c protein_coding 2772 I - 1799915
## endposition deseq_logfc deseq_adjp edger_logfc edger_adjp
## SPAC1002.01 1799015 -1.08000 0.3664 -1.05900 0.2201
## SPAC1002.02 1800053 -0.01485 0.9816 -0.02342 1.0000
## SPAC1002.03c 1803141 -0.22760 0.2327 -0.23630 0.1598
## limma_logfc limma_adjp basic_nummed basic_denmed basic_numvar
## SPAC1002.01 -0.99860 0.16930 0.000 0.000 0.000000
## SPAC1002.02 0.03778 0.99460 2.860 2.856 0.360293
## SPAC1002.03c -0.33910 0.02432 6.916 7.252 0.002955
## basic_denvar basic_logfc basic_t basic_p basic_adjp
## SPAC1002.01 0.000000 0.000000 0.00000 0.000000 0.00000
## SPAC1002.02 0.028898 0.004047 0.01124 0.991929 0.99611
## SPAC1002.03c 0.001016 -0.336700 -9.25600 0.001969 0.03734
## deseq_basemean deseq_lfcse deseq_stat deseq_p ebseq_fc ebseq_logfc
## SPAC1002.01 11.15 0.8209 -1.31600 0.1882 0.5391 -0.89135
## SPAC1002.02 87.42 0.3316 -0.04479 0.9643 1.0571 0.08007
## SPAC1002.03c 1621.00 0.1387 -1.64100 0.1008 0.8580 -0.22094
## ebseq_c1mean ebseq_c2mean ebseq_mean ebseq_var ebseq_postfc
## SPAC1002.01 14.82 7.987 11.41 33.89 0.5554
## SPAC1002.02 86.42 91.351 88.88 631.94 1.0567
## SPAC1002.03c 1763.03 1512.690 1637.86 36877.63 0.8581
## ebseq_ppee ebseq_ppde ebseq_adjp edger_logcpm edger_lr edger_p
## SPAC1002.01 0.4683 0.53174 0.4683 0.06691 2.745000 0.09757
## SPAC1002.02 0.9177 0.08227 0.9177 2.89400 0.007429 0.93130
## SPAC1002.03c 0.6947 0.30531 0.6947 7.09500 3.399000 0.06522
## limma_ave limma_t limma_b limma_p limma_adjp_ihw deseq_adjp_ihw
## SPAC1002.01 -0.1955 -2.8320 -4.0790 0.07147 1.000e+00 1.000e+00
## SPAC1002.02 2.8470 0.1354 -7.4050 0.90140 1.000e+00 8.822e-01
## SPAC1002.03c 7.0770 -12.5400 -0.6495 0.00151 1.709e-02 2.626e-01
## edger_adjp_ihw ebseq_adjp_ihw basic_adjp_ihw lfc_meta lfc_var
## SPAC1002.01 4.739e-01 1.000e+00 0.000e+00 -1.0520000 7.689e-04
## SPAC1002.02 9.141e-01 1.000e+00 1.000e+00 0.0007106 3.191e-03
## SPAC1002.03c 1.334e-01 2.918e-01 5.214e-03 -0.2681000 2.166e-03
## lfc_varbymed p_meta p_var
## SPAC1002.01 -7.305e-04 1.191e-01 3.753e-03
## SPAC1002.02 4.491e+00 9.323e-01 9.899e-04
## SPAC1002.03c -8.081e-03 5.584e-02 2.531e-03
sig_genes <- sm(extract_significant_genes(all_combined, excel = FALSE))
head(sig_genes$limma$ups[[1]], n = 3)
## ensembltranscriptid pombasetranscript ensemblgeneid
## SPBC2F12.09c SPBC2F12.09c.1 SPBC2F12.09c.1 SPBC2F12.09c
## SPAC22A12.17c SPAC22A12.17c.1 SPAC22A12.17c.1 SPAC22A12.17c
## SPAPB1A11.02 SPAPB1A11.02.1 SPAPB1A11.02.1 SPAPB1A11.02
## description
## SPBC2F12.09c transcription factor, Atf-CREB family Atf21 [Source:PomBase;Acc:SPBC2F12.09c]
## SPAC22A12.17c short chain dehydrogenase (predicted) [Source:PomBase;Acc:SPAC22A12.17c]
## SPAPB1A11.02 esterase/lipase (predicted) [Source:PomBase;Acc:SPAPB1A11.02]
## genebiotype cdslength chromosomename strand startposition
## SPBC2F12.09c protein_coding 1068 II + 1722231
## SPAC22A12.17c protein_coding 786 I - 1185837
## SPAPB1A11.02 protein_coding 1020 I + 2980428
## endposition deseq_logfc deseq_adjp edger_logfc edger_adjp
## SPBC2F12.09c 1724450 7.212 5.259e-66 7.170 1.264e-180
## SPAC22A12.17c 1189506 5.855 3.969e-19 5.822 3.155e-57
## SPAPB1A11.02 2981839 6.739 1.894e-06 6.483 1.257e-14
## limma_logfc limma_adjp basic_nummed basic_denmed basic_numvar
## SPBC2F12.09c 7.075 0.01847 6.174 -0.9106 0.02211
## SPAC22A12.17c 5.609 0.02447 9.396 3.6010 0.02236
## SPAPB1A11.02 5.606 0.01696 1.648 -3.1970 0.68686
## basic_denvar basic_logfc basic_t basic_p basic_adjp
## SPBC2F12.09c 0.5967 7.084 15.600 0.003004 0.04238
## SPAC22A12.17c 0.9694 5.795 10.080 0.008325 0.06433
## SPAPB1A11.02 0.4981 4.844 7.708 0.001687 0.03455
## deseq_basemean deseq_lfcse deseq_stat deseq_p ebseq_fc
## SPBC2F12.09c 443.5 0.4123 17.490 1.667e-68 143.63
## SPAC22A12.17c 4289.0 0.6255 9.360 7.945e-21 52.82
## SPAPB1A11.02 21.2 1.2810 5.259 1.447e-07 103.34
## ebseq_logfc ebseq_c1mean ebseq_c2mean ebseq_mean ebseq_var
## SPBC2F12.09c 7.166 6.2270 895.83 451.03 2.477e+05
## SPAC22A12.17c 5.723 161.9714 8556.19 4359.08 2.225e+07
## SPAPB1A11.02 6.691 0.4049 42.86 21.63 8.155e+02
## ebseq_postfc ebseq_ppee ebseq_ppde ebseq_adjp edger_logcpm
## SPBC2F12.09c 132.23 0 1 0 5.2210
## SPAC22A12.17c 52.65 0 1 0 8.4930
## SPAPB1A11.02 45.38 0 0 0 0.9166
## edger_lr edger_p limma_ave limma_t limma_b limma_p
## SPBC2F12.09c 839.00 1.796e-184 2.592 19.36 0.8017 0.0004519
## SPAC22A12.17c 264.90 1.479e-59 6.482 12.49 -0.3953 0.0015260
## SPAPB1A11.02 65.83 4.910e-16 -1.192 27.66 0.2698 0.0001667
## limma_adjp_ihw deseq_adjp_ihw edger_adjp_ihw ebseq_adjp_ihw
## SPBC2F12.09c 1.386e-02 4.625e-66 1.076e-180 1.000e+00
## SPAC22A12.17c 1.716e-02 3.792e-19 2.516e-57 7.778e-01
## SPAPB1A11.02 1.000e+00 7.571e-06 1.989e-14 0.000e+00
## basic_adjp_ihw lfc_meta lfc_var lfc_varbymed p_meta
## SPBC2F12.09c 1.263e-02 7.152 0.000e+00 0.000e+00 1.506e-04
## SPAC22A12.17c 1.865e-02 5.916 1.123e-01 1.898e-02 5.087e-04
## SPAPB1A11.02 6.937e-01 6.137 5.880e-02 9.581e-03 5.561e-05
## p_var
## SPBC2F12.09c 6.807e-08
## SPAC22A12.17c 7.762e-07
## SPAPB1A11.02 9.255e-09
## Here we see that edger and deseq agree the least:
all_comparisons[["comparison"]][["comp"]]
## wt30_vs_wt120
## limma_vs_deseq 0.9808
## limma_vs_edger 0.9601
## limma_vs_ebseq 0.7944
## limma_vs_basic 0.9977
## deseq_vs_edger 0.9814
## deseq_vs_ebseq 0.8329
## deseq_vs_basic 0.9965
## edger_vs_ebseq 0.9165
## edger_vs_basic 0.9972
## ebseq_vs_basic 0.9950
## And here we can look at the set of 'significant' genes according to various tools:
yeast_sig <- sm(extract_significant_genes(all_combined, excel = FALSE))
yeast_barplots <- sm(significant_barplots(combined = all_combined))
yeast_barplots[["limma"]]
yeast_barplots[["edger"]]
yeast_barplots[["deseq"]]
Since I didn’t acquire this data in a ‘normal’ way, I am going to post-generate a gff file which may be used by clusterprofiler, topgo, and gostats.
Therefore, I am going to make use of TxDb to make the requisite gff file.
limma_results <- limma_comparison[["all_tables"]]
## The set of comparisons performed
names(limma_results)
## [1] "wt30_vs_wt120"
table <- limma_results[["wt30_vs_wt120"]]
dim(table)
## [1] 7039 6
gene_names <- rownames(table)
updown_genes <- get_sig_genes(table, p = 0.05, lfc = 0.4, p_column = "P.Value")
tt <- please_install("GenomicFeatures")
tt <- please_install("biomaRt")
available_marts <- biomaRt::listMarts(host = "fungi.ensembl.org")
available_marts
## biomart version
## 1 fungi_mart Ensembl Fungi Genes 50
## 2 fungi_variations Ensembl Fungi Variations 50
ensembl_mart <- biomaRt::useMart("fungi_mart", host = "fungi.ensembl.org")
available_datasets <- biomaRt::listDatasets(ensembl_mart)
pombe_hit <- grep(pattern = "pombe", x = available_datasets[["description"]])
pombe_name <- available_datasets[pombe_hit, "dataset"]
pombe_mart <- biomaRt::useDataset(pombe_name, mart = ensembl_mart)
pombe_goids <- biomaRt::getBM(attributes = c("pombase_transcript", "go_id"),
values = gene_names, mart = pombe_mart)
colnames(pombe_goids) <- c("ID", "GO")
The above worked, it provided a table of ID and ontology. It was however a bit fraught. Here is another way.
## In theory, the above should work with a single function call:
pombe_goids_simple <- load_biomart_go(species = "spombe", overwrite = TRUE,
dl_rows = c("pombase_transcript", "go_id"),
host = "fungi.ensembl.org")
## Unable to perform useMart, perhaps the host/mart is incorrect: fungi.ensembl.org ENSEMBL_MART_ENSEMBL.
## The available marts are:
## fungi_mart, fungi_variations
## Trying the first one.
## Unable to perform useMart, perhaps the host/mart is incorrect: fungi.ensembl.org ENSEMBL_MART_ENSEMBL.
## The available marts are:
## fungi_martfungi_variations
## Trying the first one.
## Unable to perform useDataset, perhaps the given dataset is incorrect: spombe_gene_ensembl.
## Trying instead to use the dataset: spombe_eg_gene
## That seems to have worked, extracting the resulting annotations.
## Finished downloading ensembl go annotations, saving to spombe_go_annotations.rda.
## Saving ontologies to spombe_go_annotations.rda.
## Finished save().
head(pombe_goids_simple[["go"]])
## ID GO
## 1 SPRRNA.50.1
## 2 SPNCRNA.1095.1
## 3 SPAC212.11.1 GO:0000784
## 4 SPAC212.11.1 GO:0005634
## 5 SPAC212.11.1 GO:0000166
## 6 SPAC212.11.1 GO:0005524
head(pombe_goids)
## ID GO
## 1 SPRRNA.50.1
## 2 SPNCRNA.1095.1
## 3 SPAC212.11.1 GO:0000784
## 4 SPAC212.11.1 GO:0005634
## 5 SPAC212.11.1 GO:0000166
## 6 SPAC212.11.1 GO:0005524
## This used to work, but does so no longer and I do not know why.
## pombe <- sm(GenomicFeatures::makeTxDbFromBiomart(biomart = "fungal_mart",
## dataset = "spombe_eg_gene",
## host = "fungi.ensembl.org"))
## I bet I can get all this information from ensembl now.
## This was found at the bottom of: https://www.biostars.org/p/232005/
link <- "ftp://ftp.ensemblgenomes.org/pub/release-34/fungi/gff3/schizosaccharomyces_pombe/Schizosaccharomyces_pombe.ASM294v2.34.gff3.gz"
pombe <- GenomicFeatures::makeTxDbFromGFF(link, format = "gff3", taxonomyId = 4896,
organism = "Schizosaccharomyces pombe")
## Import genomic features from the file as a GRanges object ...
## OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ... OK
pombe_transcripts <- as.data.frame(GenomicFeatures::transcriptsBy(pombe))
lengths <- pombe_transcripts[, c("group_name","width")]
colnames(lengths) <- c("ID","width")
## Something useful I didn't notice before:
## makeTranscriptDbFromGFF() ## From GenomicFeatures, much like my own gff2df()
gff_from_txdb <- GenomicFeatures::asGFF(pombe)
## why is GeneID: getting prefixed to the IDs!?
gff_from_txdb$ID <- gsub(x = gff_from_txdb$ID, pattern = "GeneID:", replacement = "")
written_gff <- rtracklayer::export.gff3(gff_from_txdb, con = "pombe.gff")
## Warning in .local(object, con, format, ...): The phase information is missing. The written file will contain CDS
## with no phase information.
summary(updown_genes)
## Length Class Mode
## up_genes 6 data.frame list
## down_genes 6 data.frame list
test_genes <- updown_genes[["down_genes"]]
rownames(test_genes) <- paste0(rownames(test_genes), ".1")
lengths[["ID"]] <- paste0(lengths[["ID"]], ".1")
goseq_result <- sm(simple_goseq(sig_genes = test_genes, go_db = pombe_goids,
length_db = lengths))
head(goseq_result[["all_data"]])
## category over_represented_pvalue under_represented_pvalue numDEInCat
## 344 GO:0005634 2.969e-43 1 99
## 351 GO:0005730 1.054e-31 1 33
## 137 GO:0003674 1.573e-27 1 75
## 1236 GO:0042254 1.017e-23 1 21
## 461 GO:0006364 2.634e-23 1 20
## 353 GO:0005737 3.298e-22 1 71
## numInCat term ontology qvalue
## 344 576 nucleus CC 5.143e-40
## 351 58 nucleolus CC 9.131e-29
## 137 514 molecular_function MF 9.083e-25
## 1236 30 ribosome biogenesis BP 4.402e-21
## 461 25 rRNA processing BP 9.125e-21
## 353 535 cytoplasm CC 9.520e-20
goseq_result[["pvalue_plots"]][["mfp_plot_over"]]
goseq_result[["pvalue_plots"]][["bpp_plot_over"]]
test_genes <- updown_genes[["up_genes"]]
rownames(test_genes) <- paste0(rownames(test_genes), ".1")
goseq_result <- sm(simple_goseq(sig_genes = test_genes, go_db = pombe_goids,
length_db = lengths))
head(goseq_result[["all_data"]])
## category over_represented_pvalue under_represented_pvalue numDEInCat
## 643 GO:0008150 4.739e-48 1 116
## 384 GO:0005829 7.894e-47 1 120
## 353 GO:0005737 2.788e-46 1 118
## 344 GO:0005634 1.047e-45 1 121
## 137 GO:0003674 4.076e-42 1 109
## 854 GO:0016020 9.518e-40 1 97
## numInCat term ontology qvalue
## 643 517 biological_process BP 8.207e-45
## 384 551 cytosol CC 6.836e-44
## 353 535 cytoplasm CC 1.610e-43
## 344 576 nucleus CC 4.532e-43
## 137 514 molecular_function MF 1.412e-39
## 854 415 membrane CC 2.747e-37
goseq_result[["pvalue_plots"]][["mfp_plot_over"]]
goseq_result[["pvalue_plots"]][["bpp_plot_over"]]
clusterProfiler really prefers an orgdb instance to use, which is probably smart, as they are pretty nice. Sadly, there is no pre-defined orgdb for pombe…
## holy crap makeOrgPackageFromNCBI is slow, no slower than some of mine, so who am I to complain.
if (! "org.Spombe.eg.db" %in% installed.packages()) {
orgdb <- AnnotationForge::makeOrgPackageFromNCBI(
version = "0.1", author = "atb <abelew@gmail.com>",
maintainer = "atb <abelew@gmail.com>", tax_id = "4896",
genus = "Schizosaccharomyces", species = "pombe")
## This created the directory 'org.spombe.eg.db'
devtools::install_local("org.Spombe.eg.db")
}
library(org.Spombe.eg.db)
## Don't forget to remove the terminal .1 from the gene names...
## If you do forget this, it will fail for no easily visible reason until you remember
## this and get really mad at yourself.
rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes))
pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]])
cp_result <- simple_clusterprofiler(sig_genes = test_genes, do_david = FALSE, do_gsea = FALSE,
de_table = all_combined$data[[1]],
orgdb = org.Spombe.eg.db, orgdb_to = "ALIAS")
cp_result[["pvalue_plots"]][["ego_all_mf"]]
## Yay bar plots!
## Get rid of those stupid terminal .1s.
rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes))
pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]])
tp_result <- sm(simple_topgo(sig_genes = test_genes, go_db = pombe_goids, pval_column = "limma_adjp"))
tp_result[["pvalue_plots"]][["mfp_plot_over"]]
tp_result[["pvalue_plots"]][["bpp_plot_over"]]
## Get rid of those stupid terminal .1s.
##rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes))
pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]])
## universe_merge is the column in the final data frame when.
## gff_type is the field in the gff file providing the id, this may be redundant with
## universe merge, that is something to check on...
gst_result <- sm(simple_gostats(sig_genes = test_genes, go_db = pombe_goids, universe_merge = "id",
gff_type = "gene",
gff = "pombe.gff", pval_column = "limma_adjp"))
gst_result[["pvalue_plots"]][["mfp_plot_over"]]
gst_result[["pvalue_plots"]][["bpp_plot_over"]]
pander::pander(sessionInfo())
R version 4.0.3 (2020-10-10)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=en_US.UTF-8, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C
attached base packages: splines, stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: GO.db(v.3.12.1), AnnotationDbi(v.1.52.0), GOstats(v.2.56.0), edgeR(v.3.32.1), lme4(v.1.1-26), Matrix(v.1.3-2), BiocParallel(v.1.24.1), variancePartition(v.1.20.0), fission(v.1.10.0), ruv(v.0.9.7.1), SummarizedExperiment(v.1.20.0), GenomicRanges(v.1.42.0), GenomeInfoDb(v.1.26.2), IRanges(v.2.24.1), S4Vectors(v.0.28.1), MatrixGenerics(v.1.2.1), matrixStats(v.0.58.0), hpgltools(v.1.0), R6(v.2.5.0), Biobase(v.2.50.0) and BiocGenerics(v.0.36.0)
loaded via a namespace (and not attached): utf8(v.1.1.4), R.utils(v.2.10.1), tidyselect(v.1.1.0), RSQLite(v.2.2.3), htmlwidgets(v.1.5.3), grid(v.4.0.3), Rtsne(v.0.15), IHW(v.1.18.0), munsell(v.0.5.0), codetools(v.0.2-18), preprocessCore(v.1.52.1), statmod(v.1.4.35), withr(v.2.4.1), colorspace(v.2.0-0), Category(v.2.56.0), highr(v.0.8), knitr(v.1.31), rstudioapi(v.0.13), Vennerable(v.3.1.0.9000), robustbase(v.0.93-7), genoPlotR(v.0.8.11), labeling(v.0.4.2), slam(v.0.1-48), GenomeInfoDbData(v.1.2.4), lpsymphony(v.1.18.0), topGO(v.2.42.0), bit64(v.4.0.5), farver(v.2.0.3), rprojroot(v.2.0.2), vctrs(v.0.3.6), generics(v.0.1.0), xfun(v.0.21), BiocFileCache(v.1.14.0), fastcluster(v.1.1.25), doParallel(v.1.0.16), locfit(v.1.5-9.4), bitops(v.1.0-6), cachem(v.1.0.4), DelayedArray(v.0.16.1), assertthat(v.0.2.1), scales(v.1.1.1), gtable(v.0.3.0), sva(v.3.38.0), rlang(v.0.4.10), genefilter(v.1.72.1), rtracklayer(v.1.50.0), lazyeval(v.0.2.2), selectr(v.0.4-2), broom(v.0.7.5), yaml(v.2.2.1), reshape2(v.1.4.4), GenomicFeatures(v.1.42.1), crosstalk(v.1.1.1), backports(v.1.2.1), qvalue(v.2.22.0), RBGL(v.1.66.0), tools(v.4.0.3), ggplot2(v.3.3.3), ellipsis(v.0.3.1), gplots(v.3.1.1), jquerylib(v.0.1.3), RColorBrewer(v.1.1-2), blockmodeling(v.1.0.0), Rcpp(v.1.0.6), plyr(v.1.8.6), progress(v.1.2.2), zlibbioc(v.1.36.0), purrr(v.0.3.4), RCurl(v.1.98-1.2), BiasedUrn(v.1.07), ps(v.1.5.0), prettyunits(v.1.1.1), openssl(v.1.4.3), ggrepel(v.0.9.1), colorRamps(v.2.3), magrittr(v.2.0.1), data.table(v.1.14.0), openxlsx(v.4.2.3), SparseM(v.1.81), goseq(v.1.42.0), pkgload(v.1.2.0), hms(v.1.0.0), evaluate(v.0.14), xtable(v.1.8-4), pbkrtest(v.0.5-0.1), XML(v.3.99-0.5), gridExtra(v.2.3), testthat(v.3.0.2), compiler(v.4.0.3), biomaRt(v.2.46.3), tibble(v.3.0.6), KernSmooth(v.2.23-18), crayon(v.1.4.1), minqa(v.1.2.4), R.oo(v.1.24.0), htmltools(v.0.5.1.1), mgcv(v.1.8-34), corpcor(v.1.6.9), tidyr(v.1.1.2), geneplotter(v.1.68.0), DBI(v.1.1.1), geneLenDataBase(v.1.26.0), dbplyr(v.2.1.0), MASS(v.7.3-53.1), rappdirs(v.0.3.3), boot(v.1.3-27), ade4(v.1.7-16), readr(v.1.4.0), cli(v.2.3.1), quadprog(v.1.5-8), R.methodsS3(v.1.8.1), pkgconfig(v.2.0.3), GenomicAlignments(v.1.26.0), plotly(v.4.9.3), xml2(v.1.3.2), foreach(v.1.5.1), annotate(v.1.68.0), bslib(v.0.2.4), XVector(v.0.30.0), AnnotationForge(v.1.32.0), rvest(v.0.3.6), EBSeq(v.1.30.0), stringr(v.1.4.0), digest(v.0.6.27), graph(v.1.68.0), Biostrings(v.2.58.0), rmarkdown(v.2.7), GSEABase(v.1.52.1), directlabels(v.2021.1.13), curl(v.4.3), Rsamtools(v.2.6.0), gtools(v.3.8.2), nloptr(v.1.2.2.2), lifecycle(v.1.0.0), nlme(v.3.1-152), jsonlite(v.1.7.2), desc(v.1.2.0), viridisLite(v.0.3.0), askpass(v.1.1), limma(v.3.46.0), fansi(v.0.4.2), pillar(v.1.5.0), lattice(v.0.20-41), fastmap(v.1.1.0), httr(v.1.4.2), DEoptimR(v.1.0-8), survival(v.3.2-7), glue(v.1.4.2), zip(v.2.1.1), fdrtool(v.1.2.16), iterators(v.1.0.13), Rgraphviz(v.2.34.0), pander(v.0.6.3), bit(v.4.0.4), stringi(v.1.5.3), sass(v.0.3.1), blob(v.1.2.1), DESeq2(v.1.30.1), caTools(v.1.18.1), memoise(v.2.0.0) and dplyr(v.1.0.4)